- #
- # Sample locale file for VDK 2.0.
- #
- $control: 1
- locale:
- {
- #
- # Description: Inherit attributes from a different locale. All keywords
- # in this lng file will overwrite any attributes from the parent
- # locale. This provides a type of "subclass" mechanism for locales. For
- # example, the French Canadian locale might inherit from the French
- # locale, but override the upper and lower case tables.
- #
- # Default: The locale "english", which implements basic English
- # functionality in charset 850
- #
- # This keyword is optional
- inherit: "locale_name"
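- #
- # For illustration only (hypothetical locale name, following the French
- # Canadian case described above):
- # inherit: "french"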
-
- #
- # Description: This gives the major and minor version number of the
- # current locale. This information is stamped into collections
- # when they are created so that you can tell which version of a
- # locale the collection was created with.
- #
- # Default: The default version is 1.0
- #
- # These keywords are optional
- Major-Version: number
- Minor-Version: number
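- #
- # For illustration only (version numbers chosen arbitrarily):
- # Major-Version: 2
- # Minor-Version: 0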
-
- #
- # Description: For numeric formatting, this tells the search engine to
- # use either a comma or a period as the decimal point.
- #
- # Default: period
- #
- # This keyword is optional
- decimal: "dot_or_comma"
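- #
- # For illustration only (assuming the literal value "comma" is accepted,
- # per the "dot_or_comma" placeholder above): a locale that writes
- # numbers as "1.234,56" would use
- # decimal: "comma"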
-
- #
- # Description: Load in a library of routines to implement functional
- # callbacks for this locale. Fill in a normal DDA spec for a library.
- # The optional name specifies a name to use in the tokenizers section
- # below. (q.v.)
- #
- # Example:
- # driver: "DLL:french:InitFrenchLocale"
- #
- # This keyword is optional
- driver: "dda_spec" [ "name" ]
- ...
-
-
- #
- # Description: Flags that indicate to the engine which settings to use
- # as the defaults for the current locale.
- #
- # Default: "no" for each of these settings
- #
- locale-flags:
- {
- #
- # query parser flags -- these flags affect the way the default query
- # parser works
- #
-
- # Automatically turn on case-sensitive search when a capital letter
- # is found in the query string?
- NoAutoCase: yes/no
-
- # Use the locale's tokenizer (either the DLL function or the lex rules)
- # when tokenizing a blob of text in the query?
- # Setting this to no causes the engine to use its built-in tokenizer.
- QueryTok: yes/no
-
- # Automatically generate an expanded search when a punctuation character
- # is found in a search term? For example, "AT&T" is expanded to search
- # for a number of possible variations (tokenizations):
- #
- # <Any>(<Many><Stem>`AT&T`,<Many><Phrase>(<Many><Stem>`AT`,<Many><Stem>`T`),
- # <Many><Phrase>(<Many><Stem>`AT`,<Many><Stem>`&`,<Many><Stem>`T`))
- NoAutoPhrase: yes/no
-
- #
- # tokenization flags -- these flags affect the way text is tokenized
- # in this locale
- #
-
- # Use the engine's built-in 8bit lexer to tokenize any unprocessed
- # text chunks returned by this locale's functional tokenizer. If this
- # is on, the tokenize driver may defer tokenization of ASCII
- # passages to the engine by returning these passages as VdkBuffer
- # tokens. For example, a multibyte tokenizer can return sections of
- # embedded English text as buffer tokens, letting the engine tokenize
- # them with its built-in lexer.
- NeedAsciiTok: yes/no
- }
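- #
- # For illustration only -- a locale-flags block with every flag spelled
- # out (the yes/no values here are arbitrary; choose what fits your locale):
- #
- # locale-flags:
- # {
- # NoAutoCase: no
- # QueryTok: yes
- # NoAutoPhrase: no
- # NeedAsciiTok: yes
- # }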
-
- #
- # Description: The following strings are used to identify a locale.
- #
- locale-def:
- {
- #
- # Description: Name of this locale IF and only if it is different from
- # the name of the subdirectory under common for this locale. This
- # keyword was designed to be used with the "default" locale.
- #
- # Default: name of current locale dir
- #
- # This keyword is optional
- name: "locale_name"
-
- #
- # Description: Name of this language. This should be a two-letter string
- # in ISO 639 coding.
- #
- # Default: language of parent locale (or English "en" when no parent
- # is explicitly specified.)
- #
- # This keyword is optional
- langName: "language_name"
-
- #
- # Description: Name of the country. This should be a two-letter string
- # in ISO 3166 coding.
- #
- # Default: country of parent locale (or "US" when no parent
- # is explicitly specified.)
- #
- # This keyword is optional
- country: "country_name"
-
- #
- # Description: Name of the character set for this locale. All strings
- # that this locale will manipulate are written in this charset, and all
- # tables and data in this lng file are written in this charset.
- #
- # Default: charset of parent locale (or 850 when no parent
- # is explicitly specified.)
- #
- # This keyword is optional
- charset: "charset_name"
-
- #
- # Description: Name of the subdialect of the given language.
- #
- # Default: dialect of parent locale
- #
- # This keyword is optional
- dialect: "dialect_name"
-
- #
- # Description: Name of the supplier of this locale. This can be
- # any arbitrary string, including your company name.
- #
- # Default: supplier of parent locale
- #
- # This keyword is optional
- supplier: "supplier_name"
- }
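- #
- # For illustration only -- a hypothetical French Canadian locale-def
- # (name, dialect, and supplier strings are made up; "fr" and "CA" are
- # the ISO 639 and ISO 3166 codes):
- #
- # locale-def:
- # {
- # name: "francais-ca"
- # langName: "fr"
- # country: "CA"
- # charset: "850"
- # dialect: "canadian"
- # supplier: "Example Corp"
- # }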
-
- #
- # Description: Specify the tokenizers for various purposes.
- #
- # Equivalent: style.lex rules
- #
- # Default: the hard-wired 8bit lexer for English. If you do not specify
- # a tokenizer for any particular purpose, then it will use the tokenizer
- # you specify for the "default" purpose. If you don't specify the "default"
- # purpose, then it will use the built-in hard-wired 8bit lexer for English.
- #
- # WARNING! Tokenization is a very sensitive process. If you use different
- # tokenizers for different purposes, your highlighting is likely to be
- # off. DO NOT CHANGE TOKENIZERS UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!
- # A good solution is to set only one tokenizer for the "stream" purposes
- # and possibly one for the "extract" purposes, as these are pretty
- # independent. The best solution for most accurate highlighting is to
- # use one tokenizer for all purposes. Set this using the "default"
- # purpose. (q.v.)
- #
- # This keyword is optional.
- tokenizers:
- {
- #
- # Description: name the tokenizer to be used for indexing the
- # document.
- #
- # WARNING! If this tokenizer is different than the "View" tokenizer,
- # you will VERY likely have your highlights be off. Do not set this
- # differently unless you know what you are doing. You may want to
- # change the "stream" tokenizer instead, which sets the tokenizer
- # for all of the "index", "view", and "dynamichl" purposes.
- Index: args
-
- #
- # Description: name the tokenizer to be used for viewing the
- # document using VdkDocStreamRead. (This includes viewing with
- # highlights from the full-text index.)
- #
- # WARNING! If this tokenizer is different than the "Index" tokenizer,
- # you will VERY likely have your highlights be off. Do not set this
- # differently unless you know what you are doing. You may want to
- # change the "stream" tokenizer instead, which sets the tokenizer
- # for all of the "index", "view", and "dynamichl" purposes.
- View: args
-
- #
- # Description: name the tokenizer to be used for parsing strings
- # in a BooleanPlus query.
- #
- # WARNING! If this tokenizer is different than the "Index" tokenizer,
- # then query words will be less likely to match words in the index
- # and your recall rate will be less. Do not change this tokenizer
- # unless you know what you are doing!
- Query: args
-
- #
- # Description: name the tokenizer to be used for tokenizing a
- # document that is being summarized.
- Summarize: args
-
- #
- # Description: name the tokenizer to be used for tokenizing a
- # document that is being used as a query-by-example document.
- QBE: args
-
- #
- # Description: name the tokenizer to be used for tokenizing a
- # free-text query.
- FTQP: args
-
- #
- # Description: name the tokenizer to be used for tokenizing a
- # document that is being highlighted dynamically.
- DynamicHL: args
-
- #
- # Description: short form for naming the tokenizer used for all
- # of the Summarize, FTQP, and QBE purposes
- Extract: args
-
- #
- # Description: short form for naming the tokenizer used for all
- # of the Index, View, and DynamicHL purposes
- Stream: args
-
- #
- # Description: name the tokenizer to be used as a default if
- # any of the above purposes are not explicitly set to a particular
- # tokenizer
- Default: args
-
- #
- # The arguments to the above can be one of the following:
- #
- # driver "driverName"
- # - get the functional tokenizer from the named driver
- #
- # HWLEX
- # - use the internal hard-wired lexer
- #
- # LEX
- # - a set of style.lex-type rules is included to specify
- # a new lexer.
- # This is an 8bit table and can only be used for locales
- # that specify a character set that is of type singlebyte.
- # These lex rules have the same syntax as the style.lex file rules.
- # Note: If you are defining a multibyte locale, you must
- # provide the above functions in the locale driver.
- # If you are in a single byte locale, you can optionally write the
- # above functions in the driver, but the table below is more
- # efficient.
- #
- # Example 1: for a knowledgeable localizer. This example might give
- # highlighting problems because it uses different tokenizers for
- # different purposes.
- #
- # $control: 1
- # locale:
- # {
- # # named drivers
- # driver: "insowrap" "inso"
- # driver: "veritytok" "verity"
- #
- # tokenizers:
- # {
- # Extract: driver "inso"
- #
- # Index: driver "verity"
- # View: driver "verity"
- #
- # DynamicHL: LEX
- # {
- # define: NL "[ \t]*\n"
- #
- # token: WORD "[A-Za-z0-9]+" # word
- # token: WORD "[0-9]+\\.[0-9]+" # word
- # token: EOS "[.?!]" # end of sentence
- # token: NEWLINE "{NL}" # single end-of-line
- # token: EOP "{NL}({NL})+" # end of paragraph
- # token: TAB "\t+" # tab
- # token: WHITE " +" # whitespace
- # token: PUNCT "." # all other text
- # }
- #
- # Default: HWLEX
- # }
- # }
- #
- # Example 2: for basic tokenization. This example gives the best
- # overall highlighting fidelity because the same tokenizer is
- # used everywhere.
- #
- # $control: 1
- # locale:
- # {
- # # named drivers
- # driver: "insowrap" "inso"
- #
- # tokenizers:
- # {
- # # use this tokenizer for everything
- # Default: driver "inso"
- # }
- # }
- }
-
- #
- # Description: Specify the attributes of each character. (The "ctype"
- # table for those that know C.)
- #
- # Each entry is a bit field combination of the following:
- # LOC_UP 0x01 /* upper case letter */
- # LOC_LW 0x02 /* lower case letter */
- # LOC_NM 0x04 /* digit[0-9] */
- # LOC_SP 0x08 /* whitespace */
- # LOC_PT 0x10 /* char sometimes used as punctuation */
- # LOC_AL 0x20 /* alphabetic char (i.e. non-punctuation) */
- # (The above list was extracted from vdk_loc.h)
- #
- # Equivalent: Ctype() function in the driver
- #
- # Note: This is an 8bit table and can only be used for locales
- # that specify a character set that is of type singlebyte.
- #
- # Default: English ctype table in charset 850
- #
- # This keyword is optional.
- table: CHARTYPE
- {
- text: 0 "\x00\x01\x02 ..."
- text: 16 "\x00\x01\x02 ..."
- }
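- #
- # Worked example of the bit fields above: an upper-case letter combines
- # LOC_UP and LOC_AL, giving 0x01|0x20 = 0x21, while a space is just
- # LOC_SP, 0x08. As the offsets suggest, each byte in a "text:" string is
- # the bit field for one character code, 16 codes per line.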
-
- #
- # Description: Specify the attributes of each character.
- # (The "ctype" table for those that know C.)
- #
- # Each entry is a bit field combination of the following:
- # LOC_UP 0x01 /* upper case letter */
- # LOC_LW 0x02 /* lower case letter */
- # LOC_NM 0x04 /* digit[0-9] */
- # LOC_SP 0x08 /* whitespace */
- # LOC_PT 0x10 /* char sometimes used as punctuation */
- # LOC_AL 0x20 /* alphabetic char (i.e. non-punctuation) */
- # (The above list was extracted from vdk_loc.h)
- #
- # Equivalent: Ctype() function in the driver
- #
- # Note: This is a multibyte mapping table, and can be used for
- # any locale. However, it is much more efficient to use the table above
- # if you are defining an 8bit locale.
- #
- # Default: English ctype table in charset 850
- #
- # This keyword is optional.
- mtable: CHARTYPE
- {
- map: "char" "bit_field"
- ...
- }
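- #
- # For illustration only (the exact spelling of the "bit_field" value is
- # an assumption; the bit values themselves are from the list above):
- #
- # map: "A" "0x21" # LOC_UP | LOC_AL
- # map: "a" "0x22" # LOC_LW | LOC_AL
- # map: "7" "0x04" # LOC_NM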
-
- #
- # Description: Specify an upper-case mapping for each character.
- #
- # Equivalent: You must define all of the following in the driver:
- # UpperCopy()
- # ToUpper()
- # strcmp()
- # stricmp()
- # strncmp()
- # strnicmp()
- #
- # Note: All characters must be given an upper-cased equivalent.
- # This is an 8bit table and can only be used for locales
- # that specify a character set that is of type singlebyte.
- #
- # Default: English upper-case table in charset 850
- #
- # This keyword is optional.
- table: TOUPPER
- {
- text: 0 "\x00\x01\x02 ..."
- text: 16 "\x00\x01\x02 ..."
- ...
- text: 240 "\x00\x01\x02 ..."
- }
-
- #
- # Description: Specify an upper-case mapping for each character.
- #
- # Equivalent: You must define all of the following in the driver:
- # UpperCopy()
- # ToUpper()
- # strcmp()
- # stricmp()
- # strncmp()
- # strnicmp()
- #
- # Note: Only characters that have an upper-case version need to be given an
- # upper-case mapping. All other characters are assumed to stay the same.
- # This is a multibyte mapping table, and can be used for
- # any locale. However, it is much more efficient to use the table above
- # if you are defining an 8bit locale.
- #
- # Default: English upper-case table in charset 850
- #
- # This keyword is optional.
- mtable: TOUPPER
- {
- map: "from_char" "to_char"
- ...
- }
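- #
- # For illustration only -- plain ASCII entries showing the syntax
- # (a real locale would also map its accented characters, written in
- # the locale's charset):
- #
- # map: "a" "A"
- # map: "b" "B"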
-
- #
- # Description: Specify a lower-case mapping for each character.
- #
- # Equivalent: You must define all of the following in the driver:
- # LowerCopy()
- # ToLower()
- # strcmp()
- # stricmp()
- # strncmp()
- # strnicmp()
- #
- # Note: All characters must be given a lower-cased equivalent.
- # This is an 8bit table and can only be used for locales
- # that specify a character set that is of type singlebyte.
- #
- # Default: English lower-case table in charset 850
- #
- # This keyword is optional.
- table: TOLOWER
- {
- text:
- ...
- }
-
- #
- # Description: Specify a lower-case mapping for each character.
- #
- # Equivalent: You must define all of the following in the driver:
- # LowerCopy()
- # ToLower()
- # strcmp()
- # stricmp()
- # strncmp()
- # strnicmp()
- #
- # Note: Only characters that have a lower-case version need to be given a
- # lower-case mapping. All other characters are assumed to stay the same.
- # This is a multibyte mapping table, and can be used for
- # any locale. However, it is much more efficient to use the table above
- # if you are defining an 8bit locale.
- #
- # Default: English lower-case table in charset 850
- #
- # This keyword is optional.
- mtable: TOLOWER
- {
- map: "from_char" "to_char"
- ...
- }
-
- #
- # Description: Specify a sort order for each character in the character set.
- # This is an 8bit table and can only be used for locales
- # that specify a character set that is of type singlebyte.
- #
- # Equivalent: SortOrderValue() and OrdinalChar() in the driver
- #
- # Note: ALL characters in the character set must be listed here.
- #
- # Default: English sort order in charset 850
- #
- # This keyword is optional.
- table: SORTORDER
- {
- text: 0 "\x00\x01\x02 ..."
- text: 16 "\x00\x01\x02 ..."
- }
-
- #
- # Description: Specify a sort order for each character in the character set.
- #
- # Equivalent: SortOrderValue() and OrdinalChar() in the driver
- #
- # Note: ALL characters in the character set must be listed here.
- # This is a multibyte table, and can be used for any locale. However,
- # it is much more efficient to use the table above if you are
- # defining an 8bit locale.
- #
- # Default: English sort order in charset 850
- #
- # This keyword is optional.
- sortorder:
- {
- char: "char"
- ...
- }
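- #
- # For illustration only (assuming the listing order defines the
- # collation order, lowest sort value first):
- #
- # sortorder:
- # {
- # char: " "
- # char: "a"
- # char: "b"
- # char: "c"
- # }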
-
- #
- # Description: Specify a stem rules table.
- #
- # Equivalent: StemCopy() function in the driver.
- #
- # Note: If you are defining a multibyte locale, you must instead provide a
- # StemCopy() function in the locale driver.
- # This is an 8bit table and can only be used for locales
- # that specify a character set that is of type singlebyte.
- #
- # Default: English 8bit stem rules
- #
- # This keyword is optional.
- stemtable:
- /minlen = <int>
- {
- map: "orig_text" "new_text" "goto_label" [<minlen>]
- /position = <int>
- ...
- doubles: "goto_label" [<minlen>]
- /position = <int>
- ...
- label: "label"
- /position = <int>
- /minlen = <int>
- ...
- }
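- #
- # For illustration only (rule strings, the label name, and the numbers
- # are made up purely to show the syntax; real stem rules depend on the
- # language and on the engine's matching semantics):
- #
- # stemtable:
- # /minlen = 4
- # {
- # map: "ies" "y" "done" 5
- # map: "ing" "e" "done"
- # label: "done"
- # }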
- }
- $$
-